import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
from scipy.stats import chi2_contingency, ranksums
from pandas import Series, DataFrame
from scipy import stats
# If matplotlib is not set to inline mode, plots will not be shown in the notebook.
# Alternative backends: auto, gtk, gtk2, inline, osx, qt, qt5, wx, tk
#%matplotlib auto
#%matplotlib qt
%matplotlib inline
# path = "datos.csv"
# df = pd.read_csv(path)
# Load the survey data set 'ERDatos.csv' from the Azure ML workspace.
from azureml import Workspace
# NOTE(review): Workspace() is called without arguments, so it presumably
# picks up its credentials from the notebook environment -- confirm.
ws = Workspace()
ds = ws.datasets['ERDatos.csv']
df = ds.to_dataframe()
# Wrap the result in a pandas DataFrame (presumably a no-op if
# to_dataframe() already returns one -- confirm).
df = pd.DataFrame(data=df)
# Disabled clean-up that would strip the " (Muy de acuerdo)" and
# " (Muy en desacuerdo)" suffixes from the answers:
# df = df.replace({' \(Muy de acuerdo\)': ''}, regex=True)
# df = df.replace({' \(Muy en desacuerdo\)': ''}, regex=True)
# Notebook display expressions: preview the first rows and the column dtypes.
df.head()
df.dtypes
# Columns that are converted to float below.
numList = [
    "Age",
    "LikeGames",
    "EasyIWEB",
    "LearningEffectiveness",
    "Engagement",
    "Difficulty",
    "Organisation",
    "PreferOverLab",
    "LearnMoreThanLab",
]

# Replace every "NS/NC" answer with the mean of the real answers in the same
# column and store the column as float.
#
# The marker is masked out explicitly instead of calling
# ``replace("NS/NC", None)``: depending on the pandas version that call either
# forward-fills the previous row's answer or inserts ``None`` (which float()
# cannot convert), and an element-wise ``!= None`` comparison does not filter
# the missing entries out before the mean is taken.
for col in numList:
    is_missing = df[col] == "NS/NC"
    # Masked entries become NaN, which mean() skips.
    values = pd.to_numeric(df[col].mask(is_missing)).astype(float)
    mean = values.mean()
    # Only the "NS/NC" positions are imputed; NaN values that were already
    # present in the data are left untouched.
    df[col] = values.mask(is_missing, mean)

# Notebook display expression: summary statistics, one row per column.
df.describe(include="all").T
def checkHypotheses(name, p_val, alpha=0.05):
    """Print a short report on the outcome of a two-sample test.

    Args:
        name: Label printed as the heading of the report.
        p_val: p-value produced by the statistical test.
        alpha: Significance level; the null hypothesis is rejected when
            ``p_val < alpha``. Defaults to 0.05.

    Returns:
        None. The report is written to standard output.
    """
    print(name)
    print(" The p value is", p_val)
    if np.isnan(p_val):
        # NaN compares False against any threshold, so without this branch
        # an undefined p-value would be reported as "failed to reject".
        print(" The test is inconclusive: the p value is not a number")
    elif p_val < alpha:
        print(" The null hypothesis is rejected: " +
              "The two samples are statistically different")
    else:
        print(" Failed to reject the null hypothesis: " +
              "The two samples are alike")
    print('**************************************' +
          '**************************************')
# Stacked bar charts: for each question, count the answers per value and
# split every bar by the respondent's 'Sex'.
for column in ('GeneralOpinion', 'LikeGames', 'Engagement'):
    counts = df.groupby([column, 'Sex'])[column].count()
    # Answer/sex combinations that never occur become 0 instead of NaN.
    st = counts.unstack('Sex').fillna(0)
    st.plot(kind='bar', stacked=True)
# Split the respondents once by the 'Sex' column, then pick the individual
# answer columns out of each half.
women = df[df['Sex'] == "Mujer"]
men = df[df['Sex'] == "Hombre"]

womenLikeGames, menLikeGames = women['LikeGames'], men['LikeGames']
womenLikeER, menLikeER = women['Engagement'], men['Engagement']
womenEasy, menEasy = women['EasyIWEB'], men['EasyIWEB']
womenGeneral, menGeneral = women['GeneralOpinion'], men['GeneralOpinion']
womenKnowledge, menKnowledge = (women['LearningEffectiveness'],
                                men['LearningEffectiveness'])
womenLevel, menLevel = women['Difficulty'], men['Difficulty']
womenOrganised, menOrganised = women['Organisation'], men['Organisation']
womenPrefer, menPrefer = women['PreferOverLab'], men['PreferOverLab']
womenLearn, menLearn = women['LearnMoreThanLab'], men['LearnMoreThanLab']
womenOther, menOther = women['OtherSubjects'], men['OtherSubjects']
womenRecommend, menRecommend = women['Recommend'], men['Recommend']

# Notebook display expressions: summary of the 'LikeGames' answers per sex.
womenLikeGames.describe(include="all").T
menLikeGames.describe(include="all").T
# Rank-sum test of the men's answers against the women's answers for each
# question; every result is reported through checkHypotheses.
for label, men_sample, women_sample in (
    ("Engagement", menLikeER, womenLikeER),
    ("Games", menLikeGames, womenLikeGames),
    ("Learning Effectiveness", menKnowledge, womenKnowledge),
    ("Easy IWEB", menEasy, womenEasy),
    ("General opinion", menGeneral, womenGeneral),
    ("Difficulty ER", menLevel, womenLevel),
    ("Organisation", menOrganised, womenOrganised),
    ("Prefer over lab", menPrefer, womenPrefer),
    ("Learning effectiveness over lab", menLearn, womenLearn),
    ("Recommend", menRecommend, womenRecommend),
):
    z_stat, p_val = ranksums(men_sample, women_sample)
    checkHypotheses(label, p_val)
# Notebook display expression: pairwise correlation of the numeric columns.
# The numeric/bool columns are selected explicitly because the frame also
# holds string columns such as 'Sex', on which DataFrame.corr() raises in
# pandas >= 2.0 (numeric_only defaults to False there).
df.select_dtypes(include=["number", "bool"]).corr()
# Count every (LearningEffectiveness, Engagement) answer pair and plot the
# counts as stacked bars, one bar per LearningEffectiveness value.
pair_counts = df.groupby(
    ['LearningEffectiveness', 'Engagement']
)['LearningEffectiveness'].count()
st = pair_counts.unstack('Engagement').fillna(0)
st.plot(kind='bar', stacked=True)
# Notebook display expression: correlation of the raw answers.
np.corrcoef(df['LearningEffectiveness'], df['Engagement'])

# Binarise both answers (strictly above the threshold) and compare them.
THRESHOLD = 2
effective = df['LearningEffectiveness'].gt(THRESHOLD)
engaging = df['Engagement'].gt(THRESHOLD)
z_stat, p_val = ranksums(effective, engaging)
checkHypotheses(
    "Are learning effectiveness and engagement equally distributed?",
    p_val,
)
np.corrcoef(effective, engaging)

# Same comparison with a stricter threshold.
THRESHOLD = 3
effective = df['LearningEffectiveness'].gt(THRESHOLD)
engaging = df['Engagement'].gt(THRESHOLD)
z_stat, p_val = ranksums(effective, engaging)
checkHypotheses(
    "Are learning effectiveness and engagement equally distributed?",
    p_val,
)
np.corrcoef(effective, engaging)
# Count every (LearningEffectiveness, EasyIWEB) answer pair and plot the
# counts as stacked bars, one bar per LearningEffectiveness value.
pair_counts = df.groupby(
    ['LearningEffectiveness', 'EasyIWEB']
)['LearningEffectiveness'].count()
st = pair_counts.unstack('EasyIWEB').fillna(0)
st.plot(kind='bar', stacked=True)
# Notebook display expression: correlation of the raw answers.
np.corrcoef(df['LearningEffectiveness'], df['EasyIWEB'])

# Binarise both answers (strictly above the threshold) and compare them.
THRESHOLD = 2
effective = df['LearningEffectiveness'].gt(THRESHOLD)
easy = df['EasyIWEB'].gt(THRESHOLD)
z_stat, p_val = ranksums(effective, easy)
checkHypotheses(
    "Are learning effectiveness and opinion on IWEB level equally distributed?",
    p_val,
)

# Same comparison with a stricter threshold.
THRESHOLD = 3
effective = df['LearningEffectiveness'].gt(THRESHOLD)
easy = df['EasyIWEB'].gt(THRESHOLD)
z_stat, p_val = ranksums(effective, easy)
checkHypotheses(
    "Are learning effectiveness and opinion on IWEB level equally distributed?",
    p_val,
)